#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'moxuandeng'

from scrapy.spiders import Spider

from tutorial.items import DmozItem


class DmozSpider(Spider):
    """Spider that scrapes book/resource listings from the DMOZ directory.

    Crawls the Python Books and Resources category pages and yields one
    DmozItem per directory entry (title, link, description).
    """

    name = "dmoz"
    # Scrapy expects an iterable of domain strings here; a bare string is
    # iterated character-by-character by the offsite middleware.
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        """Extract one DmozItem per <li> entry in the directory listing.

        Args:
            response: the downloaded page for one of ``start_urls``.

        Yields:
            DmozItem: with ``title``/``link`` extracted from the entry's
            anchor and ``desc`` from the trailing text node.
        """
        sites = response.xpath('//ul[@class="directory-url"]/li')

        for site in sites:
            item = DmozItem()
            item['title'] = site.xpath('a/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            # Raw string so the regex escapes (\s, \r, \n) reach the re
            # engine verbatim; matches the "- description" text up to the
            # line terminator.
            item['desc'] = site.xpath('text()').re(r'-\s[^\n]*\r|-\s[^\r]*\n')
            yield item
